# Title: Measuring Interethnic Marriage in Africa
# Author: Branden Bohrnsen, bohrnsen@umich.edu
# Date: 1/30/2026
# Description: Estimate intermarriage rates using the exposure-adjusted index.

library(tidyverse)
library(here)

weighted_se <- function(x, w, na.rm = TRUE) {
    if (na.rm) {
        complete_cases <- !is.na(x) & !is.na(w)
        x <- x[complete_cases]
        w <- w[complete_cases]
    }
    n <- length(x)
    if (n <= 1) return(NA)
    
    weighted_mean <- sum(w * x) / sum(w)
    weighted_var <- sum(w * (x - weighted_mean)^2) / sum(w)
    effective_n <- (sum(w))^2 / sum(w^2)

    sqrt(weighted_var / effective_n)
}

marriages <- read_csv(here("data", "clean", "sample.csv")) %>%
  mutate(Urban = Urbanization >= 0.5) %>%
  mutate(across(where(is.factor), as.character)) %>%
  mutate(Intergroup = Ethnicity_Head != Ethnicity_Spouse) %>%
  mutate(Interblock = EthnicBlock_Head != EthnicBlock_Spouse) %>%
  filter(FullReside == T) %>%
  filter(Polygamous_Head == "No more than one wife linked via SPLOC")

hhi_by_level <- read_csv(here("out", "inter", "hhi_by_level.csv"))
hhi_by_decade <- read_csv(here("out", "inter", "hhi_by_decade.csv"))
hhi_by_const_decade <- read_csv(here("out", "inter", "hhi_by_const_decade.csv"))
n_gc_ethnicity <- read_csv(here("out", "inter", "n_gc_ethnicity.csv"))
n_gc_ethnic_block <- read_csv(here("out", "inter", "n_gc_ethnic_block.csv"))
hhi_quintiles_by_decade <- read_csv(here("out", "inter", "hhi_quintiles_by_decade.csv"))
hhi_by_const_time_invariant <- read_csv(here("out", "inter", "hhi_by_const_time_invariant.csv"))
hhi_quintiles_time_invariant <- read_csv(here("out", "inter", "hhi_quintiles_time_invariant.csv"))

r_gc_ethnicity <- marriages %>%
  filter(!is.na(Ethnicity_Head) & !is.na(Const) & !is.na(DecadeMarried)) %>%
  group_by(Ethnicity_Head, Const, District, DecadeMarried) %>%
  summarise(
    r_gc = mean(Intergroup, na.rm = TRUE),
    n_marriages_gc = n(),
    .groups = "drop"
  )

r_gc_ethnic_block <- marriages %>%
  filter(!is.na(EthnicBlock_Head) & !is.na(Const) & !is.na(DecadeMarried)) %>%
  group_by(EthnicBlock_Head, Const, District, DecadeMarried) %>%
  summarise(
    r_gc = mean(Interblock, na.rm = TRUE),
    n_marriages_gc = n(),
    .groups = "drop"
  )

observed_intermarriage_group <- r_gc_ethnicity %>%
  inner_join(
    n_gc_ethnicity,
    by = c("Ethnicity_Head" = "Ethnicity", "Const", "District", "DecadeMarried" = "Decade")
  ) %>%
  group_by(Const, District, DecadeMarried) %>%
  summarise(
    observed_rate_group = weighted.mean(r_gc, n_gc, na.rm = TRUE),
    total_marriages = sum(n_marriages_gc, na.rm = TRUE),
    .groups = "drop"
  )

observed_intermarriage_block <- r_gc_ethnic_block %>%
  inner_join(
    n_gc_ethnic_block,
    by = c("EthnicBlock_Head" = "EthnicBlock", "Const", "District", "DecadeMarried" = "Decade")
  ) %>%
  group_by(Const, District, DecadeMarried) %>%
  summarise(
    observed_rate_block = weighted.mean(r_gc, n_gc, na.rm = TRUE),
    total_marriages = sum(n_marriages_gc, na.rm = TRUE),
    .groups = "drop"
  )

observed_intermarriage_group <- observed_intermarriage_group %>%
  inner_join(
    hhi_by_const_decade,
    by = c("Const", "District", "DecadeMarried" = "Decade")
  ) %>%
  select(Const, District, DecadeMarried, observed_rate_group, total_marriages, N_Ethnicity, N_EthnicBlock)

observed_intermarriage_block <- observed_intermarriage_block %>%
  inner_join(
    hhi_by_const_decade,
    by = c("Const", "District", "DecadeMarried" = "Decade")
  ) %>%
  select(Const, District, DecadeMarried, observed_rate_block, total_marriages, N_Ethnicity, N_EthnicBlock)

n_c_ethnicity <- n_gc_ethnicity %>%
  group_by(Const, District, Decade) %>%
  summarise(n_c = sum(n_gc, na.rm = TRUE), .groups = "drop")

p_gc_ethnicity <- n_gc_ethnicity %>%
  inner_join(n_c_ethnicity, by = c("Const", "District", "Decade")) %>%
  mutate(p_gc = n_gc / n_c)

combined_data_group <- r_gc_ethnicity %>%
  inner_join(
    p_gc_ethnicity,
    by = c("Ethnicity_Head" = "Ethnicity", "Const", "District", "DecadeMarried" = "Decade")
  )

I_group_share_ethnicity <- combined_data_group %>%
  filter(p_gc < 1) %>%
  mutate(
    component = (r_gc / (1 - p_gc)) * p_gc,
    expectation = 1 - p_gc
  ) %>%
  group_by(Const, District, DecadeMarried) %>%
  summarise(
    I_group_share = sum(component, na.rm = TRUE),
    avg_observed_rate = weighted.mean(r_gc, w = n_gc, na.rm = TRUE),
    avg_expectation = weighted.mean(expectation, w = n_gc, na.rm = TRUE),
    total_n_group = sum(n_gc, na.rm = TRUE),
    .groups = "drop"
  )

n_c_ethnic_block <- n_gc_ethnic_block %>%
  group_by(Const, District, Decade) %>%
  summarise(n_c = sum(n_gc, na.rm = TRUE), .groups = "drop")

p_gc_ethnic_block <- n_gc_ethnic_block %>%
  inner_join(n_c_ethnic_block, by = c("Const", "District", "Decade")) %>%
  mutate(p_gc = n_gc / n_c)

combined_data_block <- r_gc_ethnic_block %>%
  inner_join(
    p_gc_ethnic_block,
    by = c("EthnicBlock_Head" = "EthnicBlock", "Const", "District", "DecadeMarried" = "Decade")
  )

I_group_share_block <- combined_data_block %>%
  filter(p_gc < 1) %>%
  mutate(
    component = (r_gc / (1 - p_gc)) * p_gc,
    expectation = 1 - p_gc
  ) %>%
  group_by(Const, District, DecadeMarried) %>%
  summarise(
    I_group_share = sum(component, na.rm = TRUE),
    avg_observed_rate = weighted.mean(r_gc, w = n_gc, na.rm = TRUE),
    avg_expectation = weighted.mean(expectation, w = n_gc, na.rm = TRUE),
    total_n_block = sum(n_gc, na.rm = TRUE),
    .groups = "drop"
  )

I_group_share_ethnicity_with_marriages <- I_group_share_ethnicity %>%
  left_join(
    observed_intermarriage_group %>% select(Const, District, DecadeMarried, total_marriages),
    by = c("Const", "District", "DecadeMarried")
  )

I_group_share_block_with_marriages <- I_group_share_block %>%
  left_join(
    observed_intermarriage_block %>% select(Const, District, DecadeMarried, total_marriages),
    by = c("Const", "District", "DecadeMarried")
  )

national_estimates_group <- I_group_share_ethnicity_with_marriages %>%
  group_by(DecadeMarried) %>%
  summarise(
    I_group_share_national = weighted.mean(I_group_share, w = total_n_group, na.rm = TRUE),
    I_group_share_national_se = weighted_se(I_group_share, w = total_n_group, na.rm = TRUE),
    avg_observed_rate_national = weighted.mean(avg_observed_rate, w = total_n_group, na.rm = TRUE),
    avg_observed_rate_national_se = weighted_se(avg_observed_rate, w = total_n_group, na.rm = TRUE),
    avg_expectation_national = weighted.mean(avg_expectation, w = total_n_group, na.rm = TRUE),
    avg_expectation_national_se = weighted_se(avg_expectation, w = total_n_group, na.rm = TRUE),
    total_n_census = sum(total_n_group, na.rm = TRUE),
    total_n_marriages = sum(total_marriages, na.rm = TRUE),
    .groups = "drop"
  ) %>% 
  mutate(Group_Type = "Ethnic Group")

national_estimates_block <- I_group_share_block_with_marriages %>%
  group_by(DecadeMarried) %>%
  summarise(
    I_group_share_national = weighted.mean(I_group_share, w = total_n_block, na.rm = TRUE),
    I_group_share_national_se = weighted_se(I_group_share, w = total_n_block, na.rm = TRUE),
    avg_observed_rate_national = weighted.mean(avg_observed_rate, w = total_n_block, na.rm = TRUE),
    avg_observed_rate_national_se = weighted_se(avg_observed_rate, w = total_n_block, na.rm = TRUE),
    avg_expectation_national = weighted.mean(avg_expectation, w = total_n_block, na.rm = TRUE),
    avg_expectation_national_se = weighted_se(avg_expectation, w = total_n_block, na.rm = TRUE),
    total_n_census = sum(total_n_block, na.rm = TRUE),
    total_n_marriages = sum(total_marriages, na.rm = TRUE),
    .groups = "drop"
  ) %>% 
  mutate(Group_Type = "Ethnic Block")

national_estimates <- rbind(national_estimates_group, national_estimates_block)

national_estimates_group_overall <- I_group_share_ethnicity_with_marriages %>%
  summarise(
    I_group_share_national = weighted.mean(I_group_share, w = total_n_group, na.rm = TRUE),
    I_group_share_national_se = weighted_se(I_group_share, w = total_n_group, na.rm = TRUE),
    avg_observed_rate_national = weighted.mean(avg_observed_rate, w = total_n_group, na.rm = TRUE),
    avg_observed_rate_national_se = weighted_se(avg_observed_rate, w = total_n_group, na.rm = TRUE),
    avg_expectation_national = weighted.mean(avg_expectation, w = total_n_group, na.rm = TRUE),
    avg_expectation_national_se = weighted_se(avg_expectation, w = total_n_group, na.rm = TRUE),
    total_n_census = sum(total_n_group, na.rm = TRUE),
    total_n_marriages = sum(total_marriages, na.rm = TRUE),
    .groups = "drop"
  ) %>% 
  mutate(Group_Type = "Ethnic Group")

national_estimates_block_overall <- I_group_share_block_with_marriages %>%
  summarise(
    I_group_share_national = weighted.mean(I_group_share, w = total_n_block, na.rm = TRUE),
    I_group_share_national_se = weighted_se(I_group_share, w = total_n_block, na.rm = TRUE),
    avg_observed_rate_national = weighted.mean(avg_observed_rate, w = total_n_block, na.rm = TRUE),
    avg_observed_rate_national_se = weighted_se(avg_observed_rate, w = total_n_block, na.rm = TRUE),
    avg_expectation_national = weighted.mean(avg_expectation, w = total_n_block, na.rm = TRUE),
    avg_expectation_national_se = weighted_se(avg_expectation, w = total_n_block, na.rm = TRUE),
    total_n_census = sum(total_n_block, na.rm = TRUE),
    total_n_marriages = sum(total_marriages, na.rm = TRUE),
    .groups = "drop"
  ) %>% 
  mutate(Group_Type = "Ethnic Block")

national_estimates_overall <- rbind(national_estimates_group_overall, national_estimates_block_overall)

urban_rural_lookup <- marriages %>%
  select(Const, District, DecadeMarried, Urban) %>%
  distinct()

I_group_share_ethnicity_with_urban <- I_group_share_ethnicity_with_marriages %>%
  left_join(urban_rural_lookup, by = c("Const", "District", "DecadeMarried"))

I_group_share_block_with_urban <- I_group_share_block_with_marriages %>%
  left_join(urban_rural_lookup, by = c("Const", "District", "DecadeMarried"))

national_estimates_group_urban_rural <- I_group_share_ethnicity_with_urban %>%
  filter(!is.na(Urban)) %>%
  group_by(Urban) %>%
  summarise(
    I_group_share_national = weighted.mean(I_group_share, w = total_n_group, na.rm = TRUE),
    I_group_share_national_se = weighted_se(I_group_share, w = total_n_group, na.rm = TRUE),
    avg_observed_rate_national = weighted.mean(avg_observed_rate, w = total_n_group, na.rm = TRUE),
    avg_observed_rate_national_se = weighted_se(avg_observed_rate, w = total_n_group, na.rm = TRUE),
    avg_expectation_national = weighted.mean(avg_expectation, w = total_n_group, na.rm = TRUE),
    avg_expectation_national_se = weighted_se(avg_expectation, w = total_n_group, na.rm = TRUE),
    total_n_census = sum(total_n_group, na.rm = TRUE),
    total_n_marriages = sum(total_marriages, na.rm = TRUE),
    .groups = "drop"
  ) %>% 
  mutate(Group_Type = "Ethnic Group")

national_estimates_block_urban_rural <- I_group_share_block_with_urban %>%
  filter(!is.na(Urban)) %>%
  group_by(Urban) %>%
  summarise(
    I_group_share_national = weighted.mean(I_group_share, w = total_n_block, na.rm = TRUE),
    I_group_share_national_se = weighted_se(I_group_share, w = total_n_block, na.rm = TRUE),
    avg_observed_rate_national = weighted.mean(avg_observed_rate, w = total_n_block, na.rm = TRUE),
    avg_observed_rate_national_se = weighted_se(avg_observed_rate, w = total_n_block, na.rm = TRUE),
    avg_expectation_national = weighted.mean(avg_expectation, w = total_n_block, na.rm = TRUE),
    avg_expectation_national_se = weighted_se(avg_expectation, w = total_n_block, na.rm = TRUE),
    total_n_census = sum(total_n_block, na.rm = TRUE),
    total_n_marriages = sum(total_marriages, na.rm = TRUE),
    .groups = "drop"
  ) %>% 
  mutate(Group_Type = "Ethnic Block")

national_estimates_urban_rural <- rbind(national_estimates_group_urban_rural, national_estimates_block_urban_rural)

national_estimates_group_urban_rural_decade <- I_group_share_ethnicity_with_urban %>%
  group_by(Urban, DecadeMarried) %>%
  summarise(
    I_group_share_national = weighted.mean(I_group_share, w = total_n_group, na.rm = TRUE),
    I_group_share_national_se = weighted_se(I_group_share, w = total_n_group, na.rm = TRUE),
    avg_observed_rate_national = weighted.mean(avg_observed_rate, w = total_n_group, na.rm = TRUE),
    avg_observed_rate_national_se = weighted_se(avg_observed_rate, w = total_n_group, na.rm = TRUE),
    avg_expectation_national = weighted.mean(avg_expectation, w = total_n_group, na.rm = TRUE),
    avg_expectation_national_se = weighted_se(avg_expectation, w = total_n_group, na.rm = TRUE),
    total_n_census = sum(total_n_group, na.rm = TRUE),
    total_n_marriages = sum(total_marriages, na.rm = TRUE),
    .groups = "drop"
  ) %>% 
  mutate(Group_Type = "Ethnic Group")

national_estimates_block_urban_rural_decade <- I_group_share_block_with_urban %>%
  group_by(Urban, DecadeMarried) %>%
  summarise(
    I_group_share_national = weighted.mean(I_group_share, w = total_n_block, na.rm = TRUE),
    I_group_share_national_se = weighted_se(I_group_share, w = total_n_block, na.rm = TRUE),
    avg_observed_rate_national = weighted.mean(avg_observed_rate, w = total_n_block, na.rm = TRUE),
    avg_observed_rate_national_se = weighted_se(avg_observed_rate, w = total_n_block, na.rm = TRUE),
    avg_expectation_national = weighted.mean(avg_expectation, w = total_n_block, na.rm = TRUE),
    avg_expectation_national_se = weighted_se(avg_expectation, w = total_n_block, na.rm = TRUE),
    total_n_census = sum(total_n_block, na.rm = TRUE),
    total_n_marriages = sum(total_marriages, na.rm = TRUE),
    .groups = "drop"
  ) %>% 
  mutate(Group_Type = "Ethnic Block")

national_estimates_urban_rural_decade <- rbind(national_estimates_group_urban_rural_decade, national_estimates_block_urban_rural_decade)

province_lookup <- marriages %>%
  select(Const, District, DecadeMarried, Province) %>%
  distinct()

I_group_share_ethnicity_with_province <- I_group_share_ethnicity_with_marriages %>%
  left_join(province_lookup, by = c("Const", "District", "DecadeMarried"))

I_group_share_block_with_province <- I_group_share_block_with_marriages %>%
  left_join(province_lookup, by = c("Const", "District", "DecadeMarried"))

national_estimates_group_province <- I_group_share_ethnicity_with_province %>%
  filter(!is.na(Province)) %>%
  group_by(Province) %>%
  summarise(
    I_group_share_national = weighted.mean(I_group_share, w = total_n_group, na.rm = TRUE),
    I_group_share_national_se = weighted_se(I_group_share, w = total_n_group, na.rm = TRUE),
    avg_observed_rate_national = weighted.mean(avg_observed_rate, w = total_n_group, na.rm = TRUE),
    avg_observed_rate_national_se = weighted_se(avg_observed_rate, w = total_n_group, na.rm = TRUE),
    avg_expectation_national = weighted.mean(avg_expectation, w = total_n_group, na.rm = TRUE),
    avg_expectation_national_se = weighted_se(avg_expectation, w = total_n_group, na.rm = TRUE),
    total_n_census = sum(total_n_group, na.rm = TRUE),
    total_n_marriages = sum(total_marriages, na.rm = TRUE),
    .groups = "drop"
  ) %>% 
  mutate(Group_Type = "Ethnic Group")

national_estimates_block_province <- I_group_share_block_with_province %>%
  filter(!is.na(Province)) %>%
  group_by(Province) %>%
  summarise(
    I_group_share_national = weighted.mean(I_group_share, w = total_n_block, na.rm = TRUE),
    I_group_share_national_se = weighted_se(I_group_share, w = total_n_block, na.rm = TRUE),
    avg_observed_rate_national = weighted.mean(avg_observed_rate, w = total_n_block, na.rm = TRUE),
    avg_observed_rate_national_se = weighted_se(avg_observed_rate, w = total_n_block, na.rm = TRUE),
    avg_expectation_national = weighted.mean(avg_expectation, w = total_n_block, na.rm = TRUE),
    avg_expectation_national_se = weighted_se(avg_expectation, w = total_n_block, na.rm = TRUE),
    total_n_census = sum(total_n_block, na.rm = TRUE),
    total_n_marriages = sum(total_marriages, na.rm = TRUE),
    .groups = "drop"
  ) %>% 
  mutate(Group_Type = "Ethnic Block")

national_estimates_province <- rbind(national_estimates_group_province, national_estimates_block_province)

national_estimates_group_district <- I_group_share_ethnicity_with_marriages %>%
  filter(!is.na(District)) %>%
  group_by(District, DecadeMarried) %>%
  summarise(
    I_group_share_national = weighted.mean(I_group_share, w = total_n_group, na.rm = TRUE),
    I_group_share_national_se = weighted_se(I_group_share, w = total_n_group, na.rm = TRUE),
    avg_observed_rate_national = weighted.mean(avg_observed_rate, w = total_n_group, na.rm = TRUE),
    avg_observed_rate_national_se = weighted_se(avg_observed_rate, w = total_n_group, na.rm = TRUE),
    avg_expectation_national = weighted.mean(avg_expectation, w = total_n_group, na.rm = TRUE),
    avg_expectation_national_se = weighted_se(avg_expectation, w = total_n_group, na.rm = TRUE),
    total_n_census = sum(total_n_group, na.rm = TRUE),
    total_n_marriages = sum(total_marriages, na.rm = TRUE),
    .groups = "drop"
  ) %>% 
  mutate(Group_Type = "Ethnic Group")

national_estimates_block_district <- I_group_share_block_with_marriages %>%
  filter(!is.na(District)) %>%
  group_by(District, DecadeMarried) %>%
  summarise(
    I_group_share_national = weighted.mean(I_group_share, w = total_n_block, na.rm = TRUE),
    I_group_share_national_se = weighted_se(I_group_share, w = total_n_block, na.rm = TRUE),
    avg_observed_rate_national = weighted.mean(avg_observed_rate, w = total_n_block, na.rm = TRUE),
    avg_observed_rate_national_se = weighted_se(avg_observed_rate, w = total_n_block, na.rm = TRUE),
    avg_expectation_national = weighted.mean(avg_expectation, w = total_n_block, na.rm = TRUE),
    avg_expectation_national_se = weighted_se(avg_expectation, w = total_n_block, na.rm = TRUE),
    total_n_census = sum(total_n_block, na.rm = TRUE),
    total_n_marriages = sum(total_marriages, na.rm = TRUE),
    .groups = "drop"
  ) %>% 
  mutate(Group_Type = "Ethnic Block")

national_estimates_district <- rbind(national_estimates_group_district, national_estimates_block_district)

national_estimates_district <- national_estimates_district %>%
  filter(DecadeMarried == "2001-2010", Group_Type == "Ethnic Block") %>%
  select(District, DecadeMarried, Group_Type, total_n_census, I_group_share_national, avg_observed_rate_national)

if (!dir.exists(here("out", "final"))) {
  dir.create(here("out", "final"), recursive = TRUE)
}

write_csv(national_estimates, here("out", "final", "marriages_by_decade.csv"))
write_csv(national_estimates_overall, here("out", "final", "marriages_overall.csv"))
write_csv(national_estimates_urban_rural, here("out", "final", "marriages_urban_rural.csv"))
write_csv(national_estimates_province, here("out", "final", "marriages_province.csv"))
write_csv(national_estimates_urban_rural_decade, here("out", "final", "marriages_urban_rural_by_decade.csv"))
write_csv(national_estimates_district, here("out", "final", "marriages_by_district.csv"))
